library(magrittr)
library(ggplot2)
library(gridExtra)
library(ggcorrplot)
source("../R/visualizations.R")
source("../R/feature_definitions.R")
training_set <- read.csv("../preprocessed_training_data.csv", row.names = 1, as.is = TRUE)
outcome <- read.csv("../training_outcomes.csv", row.names = 1)[,1]
stopifnot(row.names(training_set) == row.names(outcome))
features <- colnames(training_set)
Plot correlation matrices of missingness indicators against missingness indicators, observed values against observed values, and missingness indicators against observed values.
positive_data <- training_set[outcome == "positive", ]
negative_data <- training_set[outcome == "negative", ]
# Missingness indicator correlations
plot_missingness_correlations(training_set, numeric_features, "Missingness indicator correlations")
## Warning in cor(miss_data[, features]): the standard deviation is zero
plot_missingness_correlations(positive_data, numeric_features, "Missingness indicator correlations (positive-labeled)")
## Warning in cor(miss_data[, features]): the standard deviation is zero
plot_missingness_correlations(negative_data, numeric_features, "Missingness indicator correlations (negative-labeled)")
## Warning in cor(miss_data[, features]): the standard deviation is zero
# Observed value correlations
plot_observed_correlations(training_set, numeric_features, "Correlations of observed values")
## Warning in cor(data[, features], use = "pairwise.complete.obs"): the
## standard deviation is zero
plot_observed_correlations(positive_data, numeric_features, "Correlations of observed values (positive-labeled)")
## Warning in cor(data[, features], use = "pairwise.complete.obs"): the
## standard deviation is zero
plot_observed_correlations(negative_data, numeric_features, "Correlations of observed values (negative-labeled)")
## Warning in cor(data[, features], use = "pairwise.complete.obs"): the
## standard deviation is zero
# Missingness vs. observed correlations
plot_missingness_vs_observed_correlations(training_set, numeric_features, "Missingness correlations vs. observed values")
## Warning in cor(data, miss_data, use = "pairwise.complete.obs"): the
## standard deviation is zero
plot_missingness_vs_observed_correlations(positive_data, numeric_features, "Missingness correlations vs. observed values (positive-labeled)")
## Warning in cor(data, miss_data, use = "pairwise.complete.obs"): the
## standard deviation is zero
plot_missingness_vs_observed_correlations(negative_data, numeric_features, "Missingness correlations vs. observed values (negative-labeled)")
## Warning in cor(data, miss_data, use = "pairwise.complete.obs"): the
## standard deviation is zero
Next, plot distributions of each feature. Are they normal or linear?
feature_distribution_plots <- lapply(numeric_features,
function(column) {
ggplot2::quickplot(
na.omit(training_set[,column]),
main = column,
xlab = "",
bins = 30
)
})
marrangeGrob(
ncol = 2, nrow = 3,
grobs = feature_distribution_plots
)
They are not, and thus it might be worth considering data transformations. In the case of random forest, however, monotone transformations should have no effect.
Print (one-dimensional) contingency tables, i.e. occurence counts of each level of categorical variables.
for (cat_feat in categorical_features) {
table(training_set[, cat_feat, drop = FALSE], dnn = cat_feat, useNA = "always") %>% as.data.frame %>% print
}
## LRT_pred Freq
## 1 D 382
## 2 N 162
## 3 U 4
## 4 <NA> 2044
## Dst2SplType Freq
## 1 ACCEPTOR 115
## 2 DONOR 139
## 3 <NA> 2338
## Consequence.x Freq
## 1 3PRIME_UTR 22
## 2 5PRIME_UTR 17
## 3 DOWNSTREAM 458
## 4 INTRONIC 794
## 5 NON_SYNONYMOUS 580
## 6 NONCODING_CHANGE 13
## 7 SPLICE_SITE 73
## 8 UPSTREAM 635
## 9 <NA> 0
It is likely that missing values are more or less common in some variables depending on the predicted consequence. This can be visualized by a heatmap:
missing_value_sum_per_consequence <- lapply(training_set[, c(numeric_features, categorical_features), drop = FALSE],
function(column) {
sapply(
split(is.na(column), training_set$Consequence.x),
sum
)
})
missing_value_sum_per_consequence %<>% data.frame %>% as.matrix
heatmap(missing_value_sum_per_consequence)
Non-synonymous variants have much less missingness in certain variables and more in others (as expected).
missingness_patterns <- training_set[, c(numeric_features, categorical_features)] %>% is.na
unique_missingness_patterns <- missingness_patterns %>% unique
num_missingness_patterns <- unique_missingness_patterns %>% nrow
print(paste(num_missingness_patterns, "out of", 2^length(c(numeric_features, categorical_features)), "possible missingness patterns."))
## [1] "118 out of 72057594037927936 possible missingness patterns."
missingness_pattern_factor <- apply(missingness_patterns, MARGIN = 1, function(x) paste0(as.integer(x), collapse = "")) %>% factor
rows_per_missingness_pattern <- table(missingness_pattern_factor)
rows_per_missingness_pattern <- rows_per_missingness_pattern %>% as.data.frame
rows_per_missingness_pattern[order(rows_per_missingness_pattern$Freq, decreasing = TRUE),]